TFG Codigo

Author

Diego Brito

Librerias

library(MASS)
library(tidyverse)
library(readr)
library(psych)
library(ggplot2)
library(dplyr)
library(corrplot)
library(RColorBrewer)
library(gridExtra)
library(caret)
library(pROC)
library(car)

# library(MXM)
# library(parallel)
# library(doParallel)

Base de datos

# Point the R session at the folder that holds the data file.
# NOTE(review): this is an absolute, user-specific Windows path, so the script
# only runs on this machine; consider a project-relative path — confirm that
# nothing later in the file depends on the working directory before changing it.
setwd("C:\\Users\\diego\\OneDrive\\Escritorio\\UCM\\Cuarto\\Segundo Cuatri")
# Load the application data set into `datos`.
# NOTE(review): with read.csv()'s default `na.strings = "NA"`, blank text fields
# are presumably read as "" rather than NA, which would explain why the text
# columns report 0 missing values below — verify against the raw file.
datos <- read.csv(file = "application_data.csv")

Depuracion de datos

Primero vemos cuántas observaciones faltantes hay por columna.

# Count the NA values of every column, order the counts ascending and show
# them as a one-column data frame.
datos |>
  is.na() |>
  colSums() |>
  sort() |>
  data.frame()
                             sort.colSums.is.na.datos...
SK_ID_CURR                                             0
TARGET                                                 0
NAME_CONTRACT_TYPE                                     0
CODE_GENDER                                            0
FLAG_OWN_CAR                                           0
FLAG_OWN_REALTY                                        0
CNT_CHILDREN                                           0
AMT_INCOME_TOTAL                                       0
AMT_CREDIT                                             0
NAME_TYPE_SUITE                                        0
NAME_INCOME_TYPE                                       0
NAME_EDUCATION_TYPE                                    0
NAME_FAMILY_STATUS                                     0
NAME_HOUSING_TYPE                                      0
REGION_POPULATION_RELATIVE                             0
DAYS_BIRTH                                             0
DAYS_EMPLOYED                                          0
DAYS_REGISTRATION                                      0
DAYS_ID_PUBLISH                                        0
FLAG_MOBIL                                             0
FLAG_EMP_PHONE                                         0
FLAG_WORK_PHONE                                        0
FLAG_CONT_MOBILE                                       0
FLAG_PHONE                                             0
FLAG_EMAIL                                             0
OCCUPATION_TYPE                                        0
REGION_RATING_CLIENT                                   0
REGION_RATING_CLIENT_W_CITY                            0
WEEKDAY_APPR_PROCESS_START                             0
HOUR_APPR_PROCESS_START                                0
REG_REGION_NOT_LIVE_REGION                             0
REG_REGION_NOT_WORK_REGION                             0
LIVE_REGION_NOT_WORK_REGION                            0
REG_CITY_NOT_LIVE_CITY                                 0
REG_CITY_NOT_WORK_CITY                                 0
LIVE_CITY_NOT_WORK_CITY                                0
ORGANIZATION_TYPE                                      0
FONDKAPREMONT_MODE                                     0
HOUSETYPE_MODE                                         0
WALLSMATERIAL_MODE                                     0
EMERGENCYSTATE_MODE                                    0
FLAG_DOCUMENT_2                                        0
FLAG_DOCUMENT_3                                        0
FLAG_DOCUMENT_4                                        0
FLAG_DOCUMENT_5                                        0
FLAG_DOCUMENT_6                                        0
FLAG_DOCUMENT_7                                        0
FLAG_DOCUMENT_8                                        0
FLAG_DOCUMENT_9                                        0
FLAG_DOCUMENT_10                                       0
FLAG_DOCUMENT_11                                       0
FLAG_DOCUMENT_12                                       0
FLAG_DOCUMENT_13                                       0
FLAG_DOCUMENT_14                                       0
FLAG_DOCUMENT_15                                       0
FLAG_DOCUMENT_16                                       0
FLAG_DOCUMENT_17                                       0
FLAG_DOCUMENT_18                                       0
FLAG_DOCUMENT_19                                       0
FLAG_DOCUMENT_20                                       0
FLAG_DOCUMENT_21                                       0
DAYS_LAST_PHONE_CHANGE                                 1
CNT_FAM_MEMBERS                                        2
AMT_ANNUITY                                           12
AMT_GOODS_PRICE                                      278
EXT_SOURCE_2                                         660
OBS_30_CNT_SOCIAL_CIRCLE                            1021
DEF_30_CNT_SOCIAL_CIRCLE                            1021
OBS_60_CNT_SOCIAL_CIRCLE                            1021
DEF_60_CNT_SOCIAL_CIRCLE                            1021
AMT_REQ_CREDIT_BUREAU_HOUR                         41519
AMT_REQ_CREDIT_BUREAU_DAY                          41519
AMT_REQ_CREDIT_BUREAU_WEEK                         41519
AMT_REQ_CREDIT_BUREAU_MON                          41519
AMT_REQ_CREDIT_BUREAU_QRT                          41519
AMT_REQ_CREDIT_BUREAU_YEAR                         41519
EXT_SOURCE_3                                       60965
TOTALAREA_MODE                                    148431
YEARS_BEGINEXPLUATATION_AVG                       150007
YEARS_BEGINEXPLUATATION_MODE                      150007
YEARS_BEGINEXPLUATATION_MEDI                      150007
FLOORSMAX_AVG                                     153020
FLOORSMAX_MODE                                    153020
FLOORSMAX_MEDI                                    153020
LIVINGAREA_AVG                                    154350
LIVINGAREA_MODE                                   154350
LIVINGAREA_MEDI                                   154350
ENTRANCES_AVG                                     154828
ENTRANCES_MODE                                    154828
ENTRANCES_MEDI                                    154828
APARTMENTS_AVG                                    156061
APARTMENTS_MODE                                   156061
APARTMENTS_MEDI                                   156061
ELEVATORS_AVG                                     163891
ELEVATORS_MODE                                    163891
ELEVATORS_MEDI                                    163891
NONLIVINGAREA_AVG                                 169682
NONLIVINGAREA_MODE                                169682
NONLIVINGAREA_MEDI                                169682
EXT_SOURCE_1                                      173378
BASEMENTAREA_AVG                                  179943
BASEMENTAREA_MODE                                 179943
BASEMENTAREA_MEDI                                 179943
LANDAREA_AVG                                      182590
LANDAREA_MODE                                     182590
LANDAREA_MEDI                                     182590
OWN_CAR_AGE                                       202929
YEARS_BUILD_AVG                                   204488
YEARS_BUILD_MODE                                  204488
YEARS_BUILD_MEDI                                  204488
FLOORSMIN_AVG                                     208642
FLOORSMIN_MODE                                    208642
FLOORSMIN_MEDI                                    208642
LIVINGAPARTMENTS_AVG                              210199
LIVINGAPARTMENTS_MODE                             210199
LIVINGAPARTMENTS_MEDI                             210199
NONLIVINGAPARTMENTS_AVG                           213514
NONLIVINGAPARTMENTS_MODE                          213514
NONLIVINGAPARTMENTS_MEDI                          213514
COMMONAREA_AVG                                    214865
COMMONAREA_MODE                                   214865
COMMONAREA_MEDI                                   214865

Ahora tenemos que decidir qué hacemos con esas observaciones faltantes. Hay dos opciones: eliminar las observaciones afectadas o sustituir los valores faltantes aplicando reglas de imputación.

# Percentage of missing values per column, reshaped to long format:
# one row per column with its name and its share of NA values.
null_datos_df <- pivot_longer(
  summarise(
    datos,
    across(everything(), \(columna) sum(is.na(columna)) * 100 / n())
  ),
  cols = everything(),
  names_to = "Column_Name",
  values_to = "Null_Values_Percentage"
)

# Dot plot of the missing-value percentage of every column, ordered from the
# highest to the lowest percentage, with a dashed reference line at 40%.
ggplot(
  data = null_datos_df,
  mapping = aes(
    x = reorder(Column_Name, -Null_Values_Percentage),
    y = Null_Values_Percentage
  )
) +
  geom_point(colour = "blue") +
  geom_hline(yintercept = 40, colour = "red", linetype = "dashed") +
  labs(
    title = "Percentage of Missing Values in Application Data",
    x = "Columns",
    y = "Null Values Percentage"
  ) +
  theme_minimal() +
  # Rotate the column names so that they stay legible
  theme(axis.text.x = element_text(size = 7, angle = 90, hjust = 1))

Variables con mas de un 40 % de datos faltantes

# Keep only the columns in which 40% or more of the values are missing.
# The 40% limit is used because replacing 40-50% (or more) of a column with
# its mean or median is not considered a good idea.
nullcol_40_application <- filter(null_datos_df, Null_Values_Percentage >= 40)

# Show the result
print(nullcol_40_application)
# A tibble: 45 × 2
   Column_Name                 Null_Values_Percentage
   <chr>                                        <dbl>
 1 OWN_CAR_AGE                                   66.0
 2 EXT_SOURCE_1                                  56.4
 3 APARTMENTS_AVG                                50.7
 4 BASEMENTAREA_AVG                              58.5
 5 YEARS_BEGINEXPLUATATION_AVG                   48.8
 6 YEARS_BUILD_AVG                               66.5
 7 COMMONAREA_AVG                                69.9
 8 ELEVATORS_AVG                                 53.3
 9 ENTRANCES_AVG                                 50.3
10 FLOORSMAX_AVG                                 49.8
# ℹ 35 more rows

Datos faltantes

cuantos datos faltantes tenemos por columna

# Columns that are treated as categorical variables.
# NOTE(review): REG_REGION_NOT_LIVE_REGION is not listed although its sibling
# REG_*/LIVE_* flags are — confirm whether the omission is intentional.
categorical_columns <- c(
  "NAME_CONTRACT_TYPE",
  "CODE_GENDER",
  "NAME_TYPE_SUITE",
  "NAME_INCOME_TYPE",
  "NAME_EDUCATION_TYPE",
  "NAME_FAMILY_STATUS",
  "NAME_HOUSING_TYPE",
  "OCCUPATION_TYPE",
  "WEEKDAY_APPR_PROCESS_START",
  "ORGANIZATION_TYPE",
  "FLAG_OWN_CAR",
  "FLAG_OWN_REALTY",
  "LIVE_CITY_NOT_WORK_CITY",
  "REG_CITY_NOT_LIVE_CITY",
  "REG_CITY_NOT_WORK_CITY",
  "REG_REGION_NOT_WORK_REGION",
  "LIVE_REGION_NOT_WORK_REGION",
  "REGION_RATING_CLIENT",
  "REGION_RATING_CLIENT_W_CITY"
)

# Contact-information flags
contact_col <- c(
  "FLAG_MOBIL",
  "FLAG_EMP_PHONE",
  "FLAG_WORK_PHONE",
  "FLAG_CONT_MOBILE",
  "FLAG_PHONE",
  "FLAG_EMAIL"
)

# Document flags FLAG_DOCUMENT_2 ... FLAG_DOCUMENT_21
col_Doc <- paste0("FLAG_DOCUMENT_", 2:21)

# External source columns EXT_SOURCE_1 ... EXT_SOURCE_3
ext <- paste0("EXT_SOURCE_", 1:3)
# Show again the number of NA values per column, ordered ascending.
datos |>
  is.na() |>
  colSums() |>
  sort() |>
  data.frame()
                             sort.colSums.is.na.datos...
SK_ID_CURR                                             0
TARGET                                                 0
NAME_CONTRACT_TYPE                                     0
CODE_GENDER                                            0
FLAG_OWN_CAR                                           0
FLAG_OWN_REALTY                                        0
CNT_CHILDREN                                           0
AMT_INCOME_TOTAL                                       0
AMT_CREDIT                                             0
NAME_TYPE_SUITE                                        0
NAME_INCOME_TYPE                                       0
NAME_EDUCATION_TYPE                                    0
NAME_FAMILY_STATUS                                     0
NAME_HOUSING_TYPE                                      0
REGION_POPULATION_RELATIVE                             0
DAYS_BIRTH                                             0
DAYS_EMPLOYED                                          0
DAYS_REGISTRATION                                      0
DAYS_ID_PUBLISH                                        0
FLAG_MOBIL                                             0
FLAG_EMP_PHONE                                         0
FLAG_WORK_PHONE                                        0
FLAG_CONT_MOBILE                                       0
FLAG_PHONE                                             0
FLAG_EMAIL                                             0
OCCUPATION_TYPE                                        0
REGION_RATING_CLIENT                                   0
REGION_RATING_CLIENT_W_CITY                            0
WEEKDAY_APPR_PROCESS_START                             0
HOUR_APPR_PROCESS_START                                0
REG_REGION_NOT_LIVE_REGION                             0
REG_REGION_NOT_WORK_REGION                             0
LIVE_REGION_NOT_WORK_REGION                            0
REG_CITY_NOT_LIVE_CITY                                 0
REG_CITY_NOT_WORK_CITY                                 0
LIVE_CITY_NOT_WORK_CITY                                0
ORGANIZATION_TYPE                                      0
FONDKAPREMONT_MODE                                     0
HOUSETYPE_MODE                                         0
WALLSMATERIAL_MODE                                     0
EMERGENCYSTATE_MODE                                    0
FLAG_DOCUMENT_2                                        0
FLAG_DOCUMENT_3                                        0
FLAG_DOCUMENT_4                                        0
FLAG_DOCUMENT_5                                        0
FLAG_DOCUMENT_6                                        0
FLAG_DOCUMENT_7                                        0
FLAG_DOCUMENT_8                                        0
FLAG_DOCUMENT_9                                        0
FLAG_DOCUMENT_10                                       0
FLAG_DOCUMENT_11                                       0
FLAG_DOCUMENT_12                                       0
FLAG_DOCUMENT_13                                       0
FLAG_DOCUMENT_14                                       0
FLAG_DOCUMENT_15                                       0
FLAG_DOCUMENT_16                                       0
FLAG_DOCUMENT_17                                       0
FLAG_DOCUMENT_18                                       0
FLAG_DOCUMENT_19                                       0
FLAG_DOCUMENT_20                                       0
FLAG_DOCUMENT_21                                       0
DAYS_LAST_PHONE_CHANGE                                 1
CNT_FAM_MEMBERS                                        2
AMT_ANNUITY                                           12
AMT_GOODS_PRICE                                      278
EXT_SOURCE_2                                         660
OBS_30_CNT_SOCIAL_CIRCLE                            1021
DEF_30_CNT_SOCIAL_CIRCLE                            1021
OBS_60_CNT_SOCIAL_CIRCLE                            1021
DEF_60_CNT_SOCIAL_CIRCLE                            1021
AMT_REQ_CREDIT_BUREAU_HOUR                         41519
AMT_REQ_CREDIT_BUREAU_DAY                          41519
AMT_REQ_CREDIT_BUREAU_WEEK                         41519
AMT_REQ_CREDIT_BUREAU_MON                          41519
AMT_REQ_CREDIT_BUREAU_QRT                          41519
AMT_REQ_CREDIT_BUREAU_YEAR                         41519
EXT_SOURCE_3                                       60965
TOTALAREA_MODE                                    148431
YEARS_BEGINEXPLUATATION_AVG                       150007
YEARS_BEGINEXPLUATATION_MODE                      150007
YEARS_BEGINEXPLUATATION_MEDI                      150007
FLOORSMAX_AVG                                     153020
FLOORSMAX_MODE                                    153020
FLOORSMAX_MEDI                                    153020
LIVINGAREA_AVG                                    154350
LIVINGAREA_MODE                                   154350
LIVINGAREA_MEDI                                   154350
ENTRANCES_AVG                                     154828
ENTRANCES_MODE                                    154828
ENTRANCES_MEDI                                    154828
APARTMENTS_AVG                                    156061
APARTMENTS_MODE                                   156061
APARTMENTS_MEDI                                   156061
ELEVATORS_AVG                                     163891
ELEVATORS_MODE                                    163891
ELEVATORS_MEDI                                    163891
NONLIVINGAREA_AVG                                 169682
NONLIVINGAREA_MODE                                169682
NONLIVINGAREA_MEDI                                169682
EXT_SOURCE_1                                      173378
BASEMENTAREA_AVG                                  179943
BASEMENTAREA_MODE                                 179943
BASEMENTAREA_MEDI                                 179943
LANDAREA_AVG                                      182590
LANDAREA_MODE                                     182590
LANDAREA_MEDI                                     182590
OWN_CAR_AGE                                       202929
YEARS_BUILD_AVG                                   204488
YEARS_BUILD_MODE                                  204488
YEARS_BUILD_MEDI                                  204488
FLOORSMIN_AVG                                     208642
FLOORSMIN_MODE                                    208642
FLOORSMIN_MEDI                                    208642
LIVINGAPARTMENTS_AVG                              210199
LIVINGAPARTMENTS_MODE                             210199
LIVINGAPARTMENTS_MEDI                             210199
NONLIVINGAPARTMENTS_AVG                           213514
NONLIVINGAPARTMENTS_MODE                          213514
NONLIVINGAPARTMENTS_MEDI                          213514
COMMONAREA_AVG                                    214865
COMMONAREA_MODE                                   214865
COMMONAREA_MEDI                                   214865
# Turn every column listed in `categorical_columns` into a factor.
datos[categorical_columns] <- datos[categorical_columns] |> lapply(as.factor)

Factorizamos las variables contacto y otras que sean necesarias

# Convert the contact flags and the document flags to factors in one pass.
datos <- datos |>
  mutate(across(all_of(c(contact_col, col_Doc)), as.factor))

variables categoricas

con pocos datos faltantes (moda)

#' Impute the missing values of a categorical vector with its mode.
#'
#' @param x A vector. Only factors and character vectors are imputed; any
#'   other type is returned unchanged.
#' @return `x` with every `NA` replaced by the most frequent observed value.
#'   When several values tie, the first one in `table()` order is used. If `x`
#'   has no observed values at all it is returned unchanged.
 imputar_moda <- function(x) {
   # Scalar condition, so the short-circuit `||` is the right operator.
   if (!(is.factor(x) || is.character(x))) {
     return(x)
   }

   frecuencias <- table(x)  # table() leaves NA out of the counts by default

   # No observed value: there is no mode to impute with. Without this guard an
   # all-NA factor would be filled with its first (zero-count) level.
   if (sum(frecuencias) == 0L) {
     return(x)
   }

   moda <- names(frecuencias)[which.max(frecuencias)]
   x[is.na(x)] <- moda
   x
 }
#categorical_columns <- c(categorical_columns,"AMT_INCOME_RANGE")
# Apply the mode imputation to every categorical column
 datos[categorical_columns] <- datos[categorical_columns] |> lapply(imputar_moda)

variables numericas

Para imputar las observaciones faltantes de las variables numéricas haremos uso de la media.

#' Describe every numeric column and test it for normality.
#'
#' For each numeric column of `datos` this prints a header and the column's
#' `summary()`, draws a histogram and runs a one-sample Kolmogorov-Smirnov
#' test against a normal distribution whose mean and standard deviation are
#' computed from the column itself (missing values ignored).
#'
#' NOTE(review): the normal parameters are estimated from the same data that
#' is tested, and many columns contain ties (see the warnings ks.test emits),
#' so the p-values are only a rough guide — confirm whether a Lilliefors-type
#' correction is wanted.
#'
#' @param datos A data frame.
#' @param alpha Significance level used to decide whether normality is
#'   rejected. Defaults to 0.05.
#' @return Invisibly, a named numeric vector with the KS p-value of every
#'   numeric column (`NA` for columns that could not be tested).
distribucion_variables_numericas <- function(datos, alpha = 0.05) {
  # Names of the numeric columns, in their original order
  numeric_columns <- names(datos)[vapply(datos, is.numeric, logical(1))]
  p_valores <- setNames(rep(NA_real_, length(numeric_columns)), numeric_columns)

  # Below this value a p-value cannot be told apart from zero, so it is
  # reported as "< eps" instead of the misleading "p < 0".
  eps <- .Machine$double.eps

  for (col in numeric_columns) {
    valores <- datos[[col]]
    observados <- valores[!is.na(valores)]

    cat("\n-------------------------------------------------\n")
    cat("Distribución de la variable:", col, "\n")
    cat("-------------------------------------------------\n")

    print(summary(valores))  # Summary statistics

    # hist() fails when there is nothing to plot
    if (length(observados) > 0L) {
      hist(valores, main = paste("Histograma de", col), col = "skyblue",
           border = "white", xlab = col)
    }

    # With fewer than two distinct values the standard deviation is 0 or
    # undefined and the reference normal distribution is degenerate.
    if (length(unique(observados)) < 2L) {
      cat("La variable", col,
          "no tiene suficientes valores distintos; se omite el test\n")
      next
    }

    # Kolmogorov-Smirnov test for normality
    ks_test <- ks.test(valores, "pnorm", mean(valores, na.rm = TRUE),
                       sd(valores, na.rm = TRUE))
    p_valor <- ks_test$p.value
    p_valores[[col]] <- p_valor

    cat("\nTest de Kolmogorov-Smirnov para la normalidad:\n")
    print(ks_test)

    p_txt <- if (p_valor < eps) {
      paste("<", format(eps, digits = 2))
    } else {
      paste("=", format(p_valor, digits = 3))
    }

    if (p_valor < alpha) {
      cat("❌ La variable", col, "NO sigue una distribución normal (p", p_txt, ")\n")
    } else {
      cat("✅ La variable", col, "SIGUE una distribución normal (p", p_txt, ")\n")
    }
  }

  invisible(p_valores)
}

# Run the distribution report on the full data set
distribucion_variables_numericas(datos)

-------------------------------------------------
Distribución de la variable: SK_ID_CURR 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 100002  189146  278202  278181  367143  456255 


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.057265, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable SK_ID_CURR NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: TARGET 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.00000 0.00000 0.08073 0.00000 1.00000 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.53579, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable TARGET NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: CNT_CHILDREN 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.0000  0.4171  1.0000 19.0000 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.41858, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable CNT_CHILDREN NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: AMT_INCOME_TOTAL 
-------------------------------------------------
     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
    25650    112500    147150    168798    202500 117000000 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.30171, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable AMT_INCOME_TOTAL NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: AMT_CREDIT 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  45000  270000  513531  599026  808650 4050000 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.11015, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable AMT_CREDIT NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: AMT_ANNUITY 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   1616   16524   24903   27109   34596  258026      12 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.0789, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable AMT_ANNUITY NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: AMT_GOODS_PRICE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  40500  238500  450000  538396  679500 4050000     278 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.14269, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable AMT_GOODS_PRICE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: REGION_POPULATION_RELATIVE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00029 0.01001 0.01885 0.02087 0.02866 0.07251 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.11345, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable REGION_POPULATION_RELATIVE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: DAYS_BIRTH 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 -25229  -19682  -15750  -16037  -12413   -7489 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.048582, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable DAYS_BIRTH NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: DAYS_EMPLOYED 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 -17912   -2760   -1213   63815    -289  365243 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.49419, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable DAYS_EMPLOYED NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: DAYS_REGISTRATION 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 -24672   -7480   -4504   -4986   -2010       0 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.078483, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable DAYS_REGISTRATION NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: DAYS_ID_PUBLISH 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  -7197   -4299   -3254   -2994   -1720       0 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.12221, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable DAYS_ID_PUBLISH NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: OWN_CAR_AGE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    5.00    9.00   12.06   15.00   91.00  202929 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.16271, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable OWN_CAR_AGE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: CNT_FAM_MEMBERS 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  1.000   2.000   2.000   2.153   3.000  20.000       2 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.30217, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable CNT_FAM_MEMBERS NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: HOUR_APPR_PROCESS_START 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00   10.00   12.00   12.06   14.00   23.00 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.08234, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable HOUR_APPR_PROCESS_START NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: REG_REGION_NOT_LIVE_REGION 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.00000 0.00000 0.01514 0.00000 1.00000 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.5342, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable REG_REGION_NOT_LIVE_REGION NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: EXT_SOURCE_1 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.01    0.33    0.51    0.50    0.68    0.96  173378 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.044677, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable EXT_SOURCE_1 NO sigue una distribución normal (p < 5.58411e-233 )

-------------------------------------------------
Distribución de la variable: EXT_SOURCE_2 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 0.0000  0.3925  0.5660  0.5144  0.6636  0.8550     660 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.10691, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable EXT_SOURCE_2 NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: EXT_SOURCE_3 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.37    0.54    0.51    0.67    0.90   60965 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.061755, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable EXT_SOURCE_3 NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: APARTMENTS_AVG 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.06    0.09    0.12    0.15    1.00  156061 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.1668, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable APARTMENTS_AVG NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: BASEMENTAREA_AVG 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.04    0.08    0.09    0.11    1.00  179943 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.14167, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable BASEMENTAREA_AVG NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: YEARS_BEGINEXPLUATATION_AVG 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.98    0.98    0.98    0.99    1.00  150007 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.39064, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable YEARS_BEGINEXPLUATATION_AVG NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: YEARS_BUILD_AVG 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.69    0.76    0.75    0.82    1.00  204488 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.051642, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable YEARS_BUILD_AVG NO sigue una distribución normal (p < 4.560853e-239 )

-------------------------------------------------
Distribución de la variable: COMMONAREA_AVG 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.01    0.02    0.04    0.05    1.00  214865 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.27866, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable COMMONAREA_AVG NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: ELEVATORS_AVG 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00    0.00    0.08    0.12    1.00  163891 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.3181, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable ELEVATORS_AVG NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: ENTRANCES_AVG 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.07    0.14    0.15    0.21    1.00  154828 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.19338, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable ENTRANCES_AVG NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: FLOORSMAX_AVG 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.17    0.17    0.23    0.33    1.00  153020 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.27317, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable FLOORSMAX_AVG NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: FLOORSMIN_AVG 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.08    0.21    0.23    0.38    1.00  208642 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.22705, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable FLOORSMIN_AVG NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: LANDAREA_AVG 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.02    0.05    0.07    0.09    1.00  182590 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.20694, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable LANDAREA_AVG NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: LIVINGAPARTMENTS_AVG 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.05    0.08    0.10    0.12    1.00  210199 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.17467, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable LIVINGAPARTMENTS_AVG NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: LIVINGAREA_AVG 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.05    0.07    0.11    0.13    1.00  154350 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.18232, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable LIVINGAREA_AVG NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: NONLIVINGAPARTMENTS_AVG 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00    0.00    0.01    0.00    1.00  213514 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.42679, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable NONLIVINGAPARTMENTS_AVG NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: NONLIVINGAREA_AVG 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00    0.00    0.03    0.03    1.00  169682 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.34168, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable NONLIVINGAREA_AVG NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: APARTMENTS_MODE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.05    0.08    0.11    0.14    1.00  156061 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.17123, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable APARTMENTS_MODE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: BASEMENTAREA_MODE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.04    0.07    0.09    0.11    1.00  179943 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.14955, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable BASEMENTAREA_MODE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: YEARS_BEGINEXPLUATATION_MODE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.98    0.98    0.98    0.99    1.00  150007 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.39761, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable YEARS_BEGINEXPLUATATION_MODE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: YEARS_BUILD_MODE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.70    0.76    0.76    0.82    1.00  204488 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.054756, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable YEARS_BUILD_MODE NO sigue una distribución normal (p < 1.021391e-268 )

-------------------------------------------------
Distribución de la variable: COMMONAREA_MODE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.01    0.02    0.04    0.05    1.00  214865 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.28379, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable COMMONAREA_MODE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: ELEVATORS_MODE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00    0.00    0.07    0.12    1.00  163891 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.33652, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable ELEVATORS_MODE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: ENTRANCES_MODE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.07    0.14    0.15    0.21    1.00  154828 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.204, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable ENTRANCES_MODE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: FLOORSMAX_MODE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.17    0.17    0.22    0.33    1.00  153020 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.28906, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable FLOORSMAX_MODE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: FLOORSMIN_MODE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.08    0.21    0.23    0.38    1.00  208642 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.23649, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable FLOORSMIN_MODE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: LANDAREA_MODE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.02    0.05    0.06    0.08    1.00  182590 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.21343, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable LANDAREA_MODE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: LIVINGAPARTMENTS_MODE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.05    0.08    0.11    0.13    1.00  210199 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.17894, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable LIVINGAPARTMENTS_MODE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: LIVINGAREA_MODE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.04    0.07    0.11    0.13    1.00  154350 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.19075, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable LIVINGAREA_MODE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: NONLIVINGAPARTMENTS_MODE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00    0.00    0.01    0.00    1.00  213514 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.43073, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable NONLIVINGAPARTMENTS_MODE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: NONLIVINGAREA_MODE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00    0.00    0.03    0.02    1.00  169682 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.35025, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable NONLIVINGAREA_MODE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: APARTMENTS_MEDI 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.06    0.09    0.12    0.15    1.00  156061 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.16968, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable APARTMENTS_MEDI NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: BASEMENTAREA_MEDI 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.04    0.08    0.09    0.11    1.00  179943 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.14225, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable BASEMENTAREA_MEDI NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: YEARS_BEGINEXPLUATATION_MEDI 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.98    0.98    0.98    0.99    1.00  150007 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.39156, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable YEARS_BEGINEXPLUATATION_MEDI NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: YEARS_BUILD_MEDI 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.69    0.76    0.76    0.83    1.00  204488 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.051814, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable YEARS_BUILD_MEDI NO sigue una distribución normal (p < 1.165368e-240 )

-------------------------------------------------
Distribución de la variable: COMMONAREA_MEDI 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.01    0.02    0.04    0.05    1.00  214865 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.27905, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable COMMONAREA_MEDI NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: ELEVATORS_MEDI 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00    0.00    0.08    0.12    1.00  163891 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.32521, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable ELEVATORS_MEDI NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: ENTRANCES_MEDI 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.07    0.14    0.15    0.21    1.00  154828 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.19915, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable ENTRANCES_MEDI NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: FLOORSMAX_MEDI 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.17    0.17    0.23    0.33    1.00  153020 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.28113, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable FLOORSMAX_MEDI NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: FLOORSMIN_MEDI 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.08    0.21    0.23    0.38    1.00  208642 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.23289, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable FLOORSMIN_MEDI NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: LANDAREA_MEDI 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.02    0.05    0.07    0.09    1.00  182590 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.20683, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable LANDAREA_MEDI NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: LIVINGAPARTMENTS_MEDI 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.05    0.08    0.10    0.12    1.00  210199 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.17714, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable LIVINGAPARTMENTS_MEDI NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: LIVINGAREA_MEDI 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.05    0.07    0.11    0.13    1.00  154350 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.18396, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable LIVINGAREA_MEDI NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: NONLIVINGAPARTMENTS_MEDI 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00    0.00    0.01    0.00    1.00  213514 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.42761, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable NONLIVINGAPARTMENTS_MEDI NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: NONLIVINGAREA_MEDI 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00    0.00    0.03    0.03    1.00  169682 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.34369, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable NONLIVINGAREA_MEDI NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: TOTALAREA_MODE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.04    0.07    0.10    0.13    1.00  148431 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.18429, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable TOTALAREA_MODE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: OBS_30_CNT_SOCIAL_CIRCLE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  0.000   0.000   0.000   1.422   2.000 348.000    1021 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.27681, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable OBS_30_CNT_SOCIAL_CIRCLE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: DEF_30_CNT_SOCIAL_CIRCLE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
 0.0000  0.0000  0.0000  0.1434  0.0000 34.0000    1021 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.51118, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable DEF_30_CNT_SOCIAL_CIRCLE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: OBS_60_CNT_SOCIAL_CIRCLE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  0.000   0.000   0.000   1.405   2.000 344.000    1021 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.27743, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable OBS_60_CNT_SOCIAL_CIRCLE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: DEF_60_CNT_SOCIAL_CIRCLE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    0.0     0.0     0.0     0.1     0.0    24.0    1021 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.52471, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable DEF_60_CNT_SOCIAL_CIRCLE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: DAYS_LAST_PHONE_CHANGE 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
-4292.0 -1570.0  -757.0  -962.9  -274.0     0.0       1 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.1221, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable DAYS_LAST_PHONE_CHANGE NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: AMT_REQ_CREDIT_BUREAU_HOUR 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00    0.00    0.01    0.00    4.00   41519 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.52432, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable AMT_REQ_CREDIT_BUREAU_HOUR NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: AMT_REQ_CREDIT_BUREAU_DAY 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00    0.00    0.01    0.00    9.00   41519 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.5196, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable AMT_REQ_CREDIT_BUREAU_DAY NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: AMT_REQ_CREDIT_BUREAU_WEEK 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00    0.00    0.03    0.00    8.00   41519 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.53457, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable AMT_REQ_CREDIT_BUREAU_WEEK NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: AMT_REQ_CREDIT_BUREAU_MON 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00    0.00    0.27    0.00   27.00   41519 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.45031, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable AMT_REQ_CREDIT_BUREAU_MON NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: AMT_REQ_CREDIT_BUREAU_QRT 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   0.00    0.00    0.00    0.27    0.00  261.00   41519 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.4408, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable AMT_REQ_CREDIT_BUREAU_QRT NO sigue una distribución normal (p < 0 )

-------------------------------------------------
Distribución de la variable: AMT_REQ_CREDIT_BUREAU_YEAR 
-------------------------------------------------
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    0.0     0.0     1.0     1.9     3.0    25.0   41519 
Warning in ks.test.default(datos[[col]], "pnorm", mean(datos[[col]], na.rm =
TRUE), : ties should not be present for the one-sample Kolmogorov-Smirnov test


Test de Kolmogorov-Smirnov para la normalidad:

    Asymptotic one-sample Kolmogorov-Smirnov test

data:  datos[[col]]
D = 0.19321, p-value < 2.2e-16
alternative hypothesis: two-sided

❌ La variable AMT_REQ_CREDIT_BUREAU_YEAR NO sigue una distribución normal (p < 0 )
 # Fill the missing entries of a numeric vector with the median of its
 # observed values.
 #
 # x: a vector (typically one data-frame column).
 # Returns `x` with every NA replaced by median(x, na.rm = TRUE) when `x` is
 # numeric; any non-numeric input is returned untouched.
 imputar_mediana <- function(x) {
   # Guard clause: only numeric vectors are imputed.
   if (!is.numeric(x)) {
     return(x)
   }
   faltantes <- is.na(x)
   # The median (not the mean) of the non-missing values is the fill value.
   x[faltantes] <- median(x, na.rm = TRUE)
   x
 }
# Names of every numeric column of `datos`, in their original order.
numeric_columns <- names(datos)[vapply(datos, is.numeric, logical(1))]

# Run the median imputation column by column and write the results back.
datos[numeric_columns] <- lapply(datos[numeric_columns], imputar_mediana)
# Show the remaining NA count per column, sorted in ascending order.
data.frame(sort(colSums(is.na(datos))))
                             sort.colSums.is.na.datos...
SK_ID_CURR                                             0
TARGET                                                 0
NAME_CONTRACT_TYPE                                     0
CODE_GENDER                                            0
FLAG_OWN_CAR                                           0
FLAG_OWN_REALTY                                        0
CNT_CHILDREN                                           0
AMT_INCOME_TOTAL                                       0
AMT_CREDIT                                             0
AMT_ANNUITY                                            0
AMT_GOODS_PRICE                                        0
NAME_TYPE_SUITE                                        0
NAME_INCOME_TYPE                                       0
NAME_EDUCATION_TYPE                                    0
NAME_FAMILY_STATUS                                     0
NAME_HOUSING_TYPE                                      0
REGION_POPULATION_RELATIVE                             0
DAYS_BIRTH                                             0
DAYS_EMPLOYED                                          0
DAYS_REGISTRATION                                      0
DAYS_ID_PUBLISH                                        0
OWN_CAR_AGE                                            0
FLAG_MOBIL                                             0
FLAG_EMP_PHONE                                         0
FLAG_WORK_PHONE                                        0
FLAG_CONT_MOBILE                                       0
FLAG_PHONE                                             0
FLAG_EMAIL                                             0
OCCUPATION_TYPE                                        0
CNT_FAM_MEMBERS                                        0
REGION_RATING_CLIENT                                   0
REGION_RATING_CLIENT_W_CITY                            0
WEEKDAY_APPR_PROCESS_START                             0
HOUR_APPR_PROCESS_START                                0
REG_REGION_NOT_LIVE_REGION                             0
REG_REGION_NOT_WORK_REGION                             0
LIVE_REGION_NOT_WORK_REGION                            0
REG_CITY_NOT_LIVE_CITY                                 0
REG_CITY_NOT_WORK_CITY                                 0
LIVE_CITY_NOT_WORK_CITY                                0
ORGANIZATION_TYPE                                      0
EXT_SOURCE_1                                           0
EXT_SOURCE_2                                           0
EXT_SOURCE_3                                           0
APARTMENTS_AVG                                         0
BASEMENTAREA_AVG                                       0
YEARS_BEGINEXPLUATATION_AVG                            0
YEARS_BUILD_AVG                                        0
COMMONAREA_AVG                                         0
ELEVATORS_AVG                                          0
ENTRANCES_AVG                                          0
FLOORSMAX_AVG                                          0
FLOORSMIN_AVG                                          0
LANDAREA_AVG                                           0
LIVINGAPARTMENTS_AVG                                   0
LIVINGAREA_AVG                                         0
NONLIVINGAPARTMENTS_AVG                                0
NONLIVINGAREA_AVG                                      0
APARTMENTS_MODE                                        0
BASEMENTAREA_MODE                                      0
YEARS_BEGINEXPLUATATION_MODE                           0
YEARS_BUILD_MODE                                       0
COMMONAREA_MODE                                        0
ELEVATORS_MODE                                         0
ENTRANCES_MODE                                         0
FLOORSMAX_MODE                                         0
FLOORSMIN_MODE                                         0
LANDAREA_MODE                                          0
LIVINGAPARTMENTS_MODE                                  0
LIVINGAREA_MODE                                        0
NONLIVINGAPARTMENTS_MODE                               0
NONLIVINGAREA_MODE                                     0
APARTMENTS_MEDI                                        0
BASEMENTAREA_MEDI                                      0
YEARS_BEGINEXPLUATATION_MEDI                           0
YEARS_BUILD_MEDI                                       0
COMMONAREA_MEDI                                        0
ELEVATORS_MEDI                                         0
ENTRANCES_MEDI                                         0
FLOORSMAX_MEDI                                         0
FLOORSMIN_MEDI                                         0
LANDAREA_MEDI                                          0
LIVINGAPARTMENTS_MEDI                                  0
LIVINGAREA_MEDI                                        0
NONLIVINGAPARTMENTS_MEDI                               0
NONLIVINGAREA_MEDI                                     0
FONDKAPREMONT_MODE                                     0
HOUSETYPE_MODE                                         0
TOTALAREA_MODE                                         0
WALLSMATERIAL_MODE                                     0
EMERGENCYSTATE_MODE                                    0
OBS_30_CNT_SOCIAL_CIRCLE                               0
DEF_30_CNT_SOCIAL_CIRCLE                               0
OBS_60_CNT_SOCIAL_CIRCLE                               0
DEF_60_CNT_SOCIAL_CIRCLE                               0
DAYS_LAST_PHONE_CHANGE                                 0
FLAG_DOCUMENT_2                                        0
FLAG_DOCUMENT_3                                        0
FLAG_DOCUMENT_4                                        0
FLAG_DOCUMENT_5                                        0
FLAG_DOCUMENT_6                                        0
FLAG_DOCUMENT_7                                        0
FLAG_DOCUMENT_8                                        0
FLAG_DOCUMENT_9                                        0
FLAG_DOCUMENT_10                                       0
FLAG_DOCUMENT_11                                       0
FLAG_DOCUMENT_12                                       0
FLAG_DOCUMENT_13                                       0
FLAG_DOCUMENT_14                                       0
FLAG_DOCUMENT_15                                       0
FLAG_DOCUMENT_16                                       0
FLAG_DOCUMENT_17                                       0
FLAG_DOCUMENT_18                                       0
FLAG_DOCUMENT_19                                       0
FLAG_DOCUMENT_20                                       0
FLAG_DOCUMENT_21                                       0
AMT_REQ_CREDIT_BUREAU_HOUR                             0
AMT_REQ_CREDIT_BUREAU_DAY                              0
AMT_REQ_CREDIT_BUREAU_WEEK                             0
AMT_REQ_CREDIT_BUREAU_MON                              0
AMT_REQ_CREDIT_BUREAU_QRT                              0
AMT_REQ_CREDIT_BUREAU_YEAR                             0

Estandarizar valores

Primero pasamos las columnas con días negativos a valores positivos

# Day-count columns that are stored as negative numbers.
date_col <- c(
  "DAYS_BIRTH",
  "DAYS_EMPLOYED",
  "DAYS_REGISTRATION",
  "DAYS_ID_PUBLISH"
)

# Replace each of those columns by its absolute value.
datos[date_col] <- lapply(datos[date_col], abs)

Ahora vamos a organizar a las personas según su nivel de ingresos (discretizamos la variable en rangos)

# Express AMT_INCOME_TOTAL in units of 100,000.
# Note: the column is rescaled in place, so running this chunk twice divides
# it twice.
datos$AMT_INCOME_TOTAL <- datos$AMT_INCOME_TOTAL / 100000

# Bin edges, in units of 100,000. The last bin is open-ended: with a finite
# upper edge (previously 11) every income above it fell outside all intervals,
# was turned into NA by cut(), and the row was later removed by na.omit().
bins <- c(0:10, Inf)

# One label per income interval (one fewer than the number of edges).
slot <- c('0-100K','100K-200K', '200K-300K','300K-400K','400K-500K',
          '500K-600K','600K-700K','700K-800K','800K-900K','900K-1M', '1M Above')

# Categorical income range; include.lowest = TRUE keeps an income of exactly 0
# in the first interval.
datos$AMT_INCOME_RANGE <- cut(datos$AMT_INCOME_TOTAL, breaks = bins, labels = slot, include.lowest = TRUE)

# Relative frequency (%) of each income range.
prop.table(table(datos$AMT_INCOME_RANGE)) * 100

      0-100K    100K-200K    200K-300K    300K-400K    400K-500K    500K-600K 
20.729695163 50.734999788 21.210691261  4.776115517  1.744668526  0.356353672 
   600K-700K    700K-800K    800K-900K      900K-1M     1M Above 
 0.282804878  0.052720817  0.096980269  0.009112240  0.005857869 

Realizamos lo mismo para la cantidad de crédito, la edad y los años trabajados para facilitar las comparaciones en el futuro

# Express AMT_CREDIT in units of 100,000 (rescaled in place).
datos$AMT_CREDIT <- datos$AMT_CREDIT / 1e5

# Bin edges in units of 100,000: unit steps from 0 to 10, then one wide bin
# reaching up to 100.
bins <- c(seq(0, 10, by = 1), 100)

# One label per credit interval.
slots <- c(
  "0-100K",
  paste0(1:8, "00K-", 2:9, "00K"),
  "900K-1M",
  "1M Above"
)

# Categorical credit range; the lowest edge is included in the first interval.
datos$AMT_CREDIT_RANGE <- cut(
  x = datos$AMT_CREDIT,
  breaks = bins,
  labels = slots,
  include.lowest = TRUE
)

# Relative frequency (%) of each credit range.
100 * prop.table(table(datos$AMT_CREDIT_RANGE))

   0-100K 100K-200K 200K-300K 300K-400K 400K-500K 500K-600K 600K-700K 700K-800K 
 1.952450  9.801275 17.824728  8.564897 10.418489 11.131960  7.820533  6.241403 
800K-900K   900K-1M  1M Above 
 7.086576  2.902986 16.254703 
# Age in whole years, derived from the DAYS_BIRTH day count (365-day years).
datos[["AGE"]] <- floor(abs(datos[["DAYS_BIRTH"]]) / 365)

# Edges of the age groups: 0, then decades from 20 to 50, then 100.
bins <- c(0, seq(20, 50, by = 10), 100)

# One label per age group.
slots <- c("0-20", "20-30", "30-40", "40-50", "50 above")

# Categorical age group. Intervals are closed on the right, so an age equal
# to an edge (e.g. 30) is placed in the lower group.
datos[["AGE_GROUP"]] <- cut(
  x = datos[["AGE"]],
  breaks = bins,
  labels = slots,
  include.lowest = TRUE
)

# Relative frequency (%) of each age group.
100 * prop.table(table(datos[["AGE_GROUP"]]))

        0-20        20-30        30-40        40-50     50 above 
3.251916e-04 1.717174e+01 2.702895e+01 2.419458e+01 3.160440e+01 
# Years of employment in whole years, derived from the DAYS_EMPLOYED day count
# (365-day years). The AGE column is not recomputed here: it is already
# assigned, with the same expression, when the age groups are built.
datos$YEARS_EMPLOYED <- floor(abs(datos$DAYS_EMPLOYED) / 365)

# Edges of the employment-length groups, in years.
# Any YEARS_EMPLOYED value above the last edge (150) falls outside every
# interval and is turned into NA by cut().
bins <- c(0, 5, 10, 20, 30, 40, 50, 60, 150)

# One label per employment-length group.
slots <- c('0-5', '5-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60 above')

# Categorical employment-length group; include.lowest = TRUE keeps 0 years in
# the first interval.
datos$EMPLOYMENT_YEAR <- cut(datos$YEARS_EMPLOYED, breaks = bins, labels = slots, include.lowest = TRUE)

# Relative frequency (%) of each employment-length group.
prop.table(table(datos$EMPLOYMENT_YEAR)) * 100

        0-5        5-10       10-20       20-30       30-40       40-50 
60.49806256 22.20340529 12.95248218  3.33509164  0.94155162  0.06940671 
      50-60    60 above 
 0.00000000  0.00000000 

Se lleva a cabo esto para facilitar la comparación entre observaciones y la clasificación de los modelos, viendo la diferencia entre los distintos grupos

L1 PENALTY PARA LA REGRESION USAR apuntaría brevemente en cada caso, que puedes hacer para seguir

#TIPIFICAR TODAS LAS VARIABLES NUMERICAS 

Factorial de variables

Variables economicas

# Columns fed to the factor analysis.
# (CNT_FAM_MEMBERS and CNT_CHILDREN are deliberately left out for now.)
economic_vars <- datos[c(
  "AMT_INCOME_TOTAL",
  "AMT_CREDIT",
  "AMT_ANNUITY",
  "AMT_GOODS_PRICE",
  "OWN_CAR_AGE",
  "DAYS_EMPLOYED"
)]

# Centre every column and scale it to unit standard deviation.
economic_vars_scaled <- scale(economic_vars, center = TRUE, scale = TRUE)

# Two-factor model with varimax rotation.
factor_analysis <- factanal(economic_vars_scaled, factors = 2, rotation = "varimax")

# Print the fit, hiding loadings below 0.3 and sorting variables by factor.
print(factor_analysis, digits = 3, cutoff = 0.3, sort = TRUE)

Call:
factanal(x = economic_vars_scaled, factors = 2, rotation = "varimax")

Uniquenesses:
AMT_INCOME_TOTAL       AMT_CREDIT      AMT_ANNUITY  AMT_GOODS_PRICE 
           0.908            0.020            0.328            0.006 
     OWN_CAR_AGE    DAYS_EMPLOYED 
           0.999            0.953 

Loadings:
                 Factor1 Factor2
AMT_CREDIT        0.973         
AMT_ANNUITY       0.717   0.398 
AMT_GOODS_PRICE   0.980         
AMT_INCOME_TOTAL                
OWN_CAR_AGE                     
DAYS_EMPLOYED                   

               Factor1 Factor2
SS loadings      2.436   0.351
Proportion Var   0.406   0.059
Cumulative Var   0.406   0.464

Test of the hypothesis that 2 factors are sufficient.
The chi square statistic is 671.06 on 4 degrees of freedom.
The p-value is 6.43e-144 
print(factor_analysis$loadings)

Loadings:
                 Factor1 Factor2
AMT_INCOME_TOTAL  0.110   0.283 
AMT_CREDIT        0.973   0.182 
AMT_ANNUITY       0.717   0.398 
AMT_GOODS_PRICE   0.980   0.181 
OWN_CAR_AGE                     
DAYS_EMPLOYED            -0.216 

               Factor1 Factor2
SS loadings      2.436   0.351
Proportion Var   0.406   0.059
Cumulative Var   0.406   0.464
print("------------------------- KMO -----------------------------------")
[1] "------------------------- KMO -----------------------------------"
KMO(economic_vars_scaled)  # Kaiser-Meyer-Olkin sampling adequacy index
Kaiser-Meyer-Olkin factor adequacy
Call: KMO(r = economic_vars_scaled)
Overall MSA =  0.7
MSA for each item = 
AMT_INCOME_TOTAL       AMT_CREDIT      AMT_ANNUITY  AMT_GOODS_PRICE 
            0.87             0.63             0.97             0.63 
     OWN_CAR_AGE    DAYS_EMPLOYED 
            0.61             0.70 
cortest.bartlett(economic_vars_scaled)  # Bartlett's test of sphericity
R was not square, finding R from data
$chisq
[1] 1417942

$p.value
[1] 0

$df
[1] 15
print("------------------------ loadings ------------------------------------")
[1] "------------------------ loadings ------------------------------------"
# Loadings of the two factors as a plain data frame, one row per variable.
loadings <- as.data.frame(
  unclass(factor_analysis$loadings)[, c("Factor1", "Factor2")]
)
# Keep the variable names in a regular column so they can be used as labels.
loadings[["Variable"]] <- rownames(loadings)
print("-------------------------- ggplot ----------------------------------")
[1] "-------------------------- ggplot ----------------------------------"
# Principal components of the economic variables, used only for the scree
# plot. The argument is spelled `scale.` (its real name in prcomp) instead of
# relying on partial matching of `scale`.
pca_result <- prcomp(economic_vars_scaled, scale. = TRUE)
screeplot(pca_result, type = "lines", main = "Scree Plot")

# Scatter of the two factor loadings, each point drawn as the variable name.
ggplot(loadings, aes(x = Factor1, y = Factor2, label = Variable)) +
  geom_text(size = 5) +
  theme_minimal() +
  ggtitle("Carga Factorial de Variables Económicas")

Valores atipicos

# Columns inspected for outliers, kept in two groups.
app_outlier_col_1 <- c(
  "AMT_ANNUITY", "AMT_INCOME_TOTAL", "AMT_CREDIT", "AMT_GOODS_PRICE",
  "DAYS_EMPLOYED"
)
app_outlier_col_2 <- c("CNT_CHILDREN", "DAYS_BIRTH")

# Build the boxplot of one column of `datos`, titled with the column name.
crear_boxplot <- function(var) {
  ggplot(datos, aes(y = .data[[var]])) +
    geom_boxplot(fill = "lightblue", color = "black") +
    labs(title = var, y = "") +
    theme_minimal()
}

# One boxplot per column, for each group.
plots1 <- lapply(app_outlier_col_1, crear_boxplot)
plots2 <- lapply(app_outlier_col_2, crear_boxplot)

# Lay every boxplot out in a single figure with four columns.
grid.arrange(grobs = c(plots1, plots2), ncol = 4)

# Remove the rows whose NAME_FAMILY_STATUS is "Unknown" and drop the level
# that is left empty.
datos <- datos |> filter(NAME_FAMILY_STATUS != "Unknown")
datos$NAME_FAMILY_STATUS <- droplevels(datos$NAME_FAMILY_STATUS)
# Remove the rows in the "50-60" and "60 above" employment groups.
datos <- datos[!datos$EMPLOYMENT_YEAR %in% c("50-60", "60 above"), ]
# Remove the rows whose CODE_GENDER is "XNA" and drop the level left empty.
datos <- datos[datos$CODE_GENDER != "XNA", ]
datos$CODE_GENDER <- droplevels(datos$CODE_GENDER)
# Unemployed applicants need an employment group of their own: they are
# assigned the level "0".
# NOTE(review): the original comment said they were assigned to "0-5", but the
# code has always written "0"; confirm which one is intended.
employment_levels <- c("0", levels(datos$EMPLOYMENT_YEAR))
employment_chr <- ifelse(
  datos$NAME_INCOME_TYPE == "Unemployed", "0", as.character(datos$EMPLOYMENT_YEAR))
# Rebuild the factor with the levels in their natural order ("0", "0-5",
# "5-10", "10-20", ...). as.factor() on the character vector would sort them
# alphabetically and place "5-10" after "40-50". Unused levels are dropped.
datos$EMPLOYMENT_YEAR <- droplevels(factor(employment_chr, levels = employment_levels))
# Any row that still contains an NA in any column is removed.
datos <- na.omit(datos)

Tablas de contingencia

# Print a contingency table and a chi-squared test of independence between
# one column and each of several other columns.
#
# df:  data frame holding the columns.
# x:   name (string) of the column placed on the rows of every table.
# vec: character vector with the names of the columns to cross with `x`.
#
# Returns, invisibly, a list named after `vec` with the chisq.test() result of
# each column, so the tests can be reused without parsing the console output.
tb_conting <- function(df, x, vec) {
  resultados <- vector("list", length(vec))
  names(resultados) <- vec

  for (i in seq_along(vec)) {
    cat("\nTabla de Contingencia para:", vec[i], "\n")

    # Both columns go through factor() so unused levels are dropped before
    # tabulating; `dnn` labels the dimensions directly, which avoids the
    # length mismatch that assigning dimnames afterwards caused for factors
    # with empty levels, and labels the rows with the actual name in `x`.
    tab <- table(factor(df[[x]]), factor(df[[vec[i]]]), dnn = c(x, "Variable"))

    print(tab)

    cat("\nTest de Chi-Cuadrado:\n")
    chi_test <- chisq.test(tab)
    print(chi_test)

    cat("\n--------------------------\n")
    resultados[[i]] <- chi_test
  }

  invisible(resultados)
}


# Run the contingency tables and chi-squared tests of TARGET against every
# column named in contact_col.
tb_conting(datos, "TARGET", contact_col)  # You can also try col_Doc or ext

Tabla de Contingencia para: FLAG_MOBIL 
      Variable
TARGET      0      1
     0      1 230098
     1      0  21832

Test de Chi-Cuadrado:
Warning in chisq.test(tab): Chi-squared approximation may be incorrect

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 2.7239e-22, df = 1, p-value = 1


--------------------------

Tabla de Contingencia para: FLAG_EMP_PHONE 
      Variable
TARGET      0      1
     0     25 230074
     1      9  21823

Test de Chi-Cuadrado:
Warning in chisq.test(tab): Chi-squared approximation may be incorrect

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 11.463, df = 1, p-value = 0.0007101


--------------------------

Tabla de Contingencia para: FLAG_WORK_PHONE 
      Variable
TARGET      0      1
     0 174752  55347
     1  15931   5901

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 95.784, df = 1, p-value < 2.2e-16


--------------------------

Tabla de Contingencia para: FLAG_CONT_MOBILE 
      Variable
TARGET      0      1
     0    490 229609
     1     43  21789

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 0.17177, df = 1, p-value = 0.6785


--------------------------

Tabla de Contingencia para: FLAG_PHONE 
      Variable
TARGET      0      1
     0 165455  64644
     1  16534   5298

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 145.42, df = 1, p-value < 2.2e-16


--------------------------

Tabla de Contingencia para: FLAG_EMAIL 
      Variable
TARGET      0      1
     0 215396  14703
     1  20550   1282

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 8.9079, df = 1, p-value = 0.002839


--------------------------
tb_conting(datos, "TARGET", col_Doc)  # Same tables and tests for the columns named in col_Doc

Tabla de Contingencia para: FLAG_DOCUMENT_2 
      Variable
TARGET      0      1
     0 230090      9
     1  21828      4

Test de Chi-Cuadrado:
Warning in chisq.test(tab): Chi-squared approximation may be incorrect

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 5.4751, df = 1, p-value = 0.01929


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_3 
      Variable
TARGET      0      1
     0  55752 174347
     1   3938  17894

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 422.5, df = 1, p-value < 2.2e-16


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_4 
      Variable
TARGET      0      1
     0 230079     20
     1  21832      0

Test de Chi-Cuadrado:
Warning in chisq.test(tab): Chi-squared approximation may be incorrect

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 0.96074, df = 1, p-value = 0.327


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_5 
      Variable
TARGET      0      1
     0 226355   3744
     1  21483    349

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 0.084646, df = 1, p-value = 0.7711


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_6 
      Variable
TARGET      0      1
     0 228048   2051
     1  21698    134

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 17.548, df = 1, p-value = 2.802e-05


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_7 
      Variable
TARGET      0      1
     0 230052     47
     1  21829      3

Test de Chi-Cuadrado:
Warning in chisq.test(tab): Chi-squared approximation may be incorrect

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 0.17534, df = 1, p-value = 0.6754


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_8 
      Variable
TARGET      0      1
     0 207499  22600
     1  20016   1816

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 51.349, df = 1, p-value = 7.732e-13


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_9 
      Variable
TARGET      0      1
     0 229016   1083
     1  21759     73

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 7.8141, df = 1, p-value = 0.005184


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_10 
      Variable
TARGET      0      1
     0 230093      6
     1  21832      0

Test de Chi-Cuadrado:
Warning in chisq.test(tab): Chi-squared approximation may be incorrect

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 0.00083827, df = 1, p-value = 0.9769


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_11 
      Variable
TARGET      0      1
     0 228973   1126
     1  21757     75

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 8.6322, df = 1, p-value = 0.003303


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_12 
      Variable
TARGET      0      1
     0 230097      2
     1  21832      0

Test de Chi-Cuadrado:
Warning in chisq.test(tab): Chi-squared approximation may be incorrect

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 5.4479e-22, df = 1, p-value = 1


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_13 
      Variable
TARGET      0      1
     0 229063   1036
     1  21803     29

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 46.973, df = 1, p-value = 7.198e-12


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_14 
      Variable
TARGET      0      1
     0 229244    855
     1  21802     30

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 30.57, df = 1, p-value = 3.221e-08


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_15 
      Variable
TARGET      0      1
     0 229748    351
     1  21821     11

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 13.8, df = 1, p-value = 0.0002033


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_16 
      Variable
TARGET      0      1
     0 227248   2851
     1  21682    150

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 51.147, df = 1, p-value = 8.571e-13


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_17 
      Variable
TARGET      0      1
     0 230020     79
     1  21830      2

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 3.1869, df = 1, p-value = 0.07423


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_18 
      Variable
TARGET      0      1
     0 227768   2331
     1  21690    142

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 26.604, df = 1, p-value = 2.497e-07


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_19 
      Variable
TARGET      0      1
     0 229932    167
     1  21820     12

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 0.64075, df = 1, p-value = 0.4234


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_20 
      Variable
TARGET      0      1
     0 229957    142
     1  21819     13

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 2.9034e-28, df = 1, p-value = 1


--------------------------

Tabla de Contingencia para: FLAG_DOCUMENT_21 
      Variable
TARGET      0      1
     0 230010     89
     1  21818     14

Test de Chi-Cuadrado:

    Pearson's Chi-squared test with Yates' continuity correction

data:  tab
X-squared = 2.5675, df = 1, p-value = 0.1091


--------------------------

Analisis de Datos

En un principio me interesa saber cuáles son las variables más importantes a la hora de predecir si alguien va a devolver el pago o no; por tanto, realizamos un modelo con todas las variables y hacemos el ANOVA para ver cuáles son las más significativas

# Fit a linear model of TARGET on every other column of `datos` and keep the
# resulting ANOVA table.
# NOTE(review): anova() on a single lm fit reports sequential sums of squares,
# so the ranking below presumably depends on the column order of `datos` --
# confirm this is acceptable for the variable screening.
anova_results <- anova(lm(TARGET ~ ., data = datos))

# Rank the terms from largest to smallest sum of squares; the outer
# parentheses print the sorted table as well as storing it.
(
  anova_sorted <- anova_results[order(-anova_results[["Sum Sq"]]), ]
)
Analysis of Variance Table

Response: TARGET
                                 Df  Sum Sq Mean Sq   F value    Pr(>F)    
Residuals                    251666 18566.3    0.07                        
EXT_SOURCE_3                      1   324.4  324.44 4397.8180 < 2.2e-16 ***
EXT_SOURCE_2                      1   320.4  320.40 4343.0108 < 2.2e-16 ***
DAYS_BIRTH                        1    61.4   61.44  832.7706 < 2.2e-16 ***
AMT_GOODS_PRICE                   1    57.1   57.05  773.3386 < 2.2e-16 ***
FLAG_OWN_CAR                      1    51.8   51.78  701.8813 < 2.2e-16 ***
EXT_SOURCE_1                      1    49.1   49.13  665.9867 < 2.2e-16 ***
CODE_GENDER                       1    47.0   47.01  637.2217 < 2.2e-16 ***
DAYS_EMPLOYED                     1    42.2   42.17  571.6829 < 2.2e-16 ***
REGION_RATING_CLIENT              2    41.8   20.90  283.3041 < 2.2e-16 ***
NAME_EDUCATION_TYPE               4    39.6    9.89  134.0456 < 2.2e-16 ***
AMT_INCOME_TOTAL                  1    29.3   29.31  397.2466 < 2.2e-16 ***
NAME_INCOME_TYPE                  7    28.2    4.03   54.6094 < 2.2e-16 ***
AMT_CREDIT_RANGE                 10    26.1    2.61   35.3973 < 2.2e-16 ***
NAME_CONTRACT_TYPE                1    26.0   25.97  352.0502 < 2.2e-16 ***
NAME_FAMILY_STATUS                4    23.3    5.81   78.8051 < 2.2e-16 ***
AMT_CREDIT                        1    21.6   21.62  293.1172 < 2.2e-16 ***
ORGANIZATION_TYPE                56    20.9    0.37    5.0688 < 2.2e-16 ***
DAYS_ID_PUBLISH                   1    18.4   18.44  250.0106 < 2.2e-16 ***
OCCUPATION_TYPE                  18    17.3    0.96   12.9905 < 2.2e-16 ***
REGION_POPULATION_RELATIVE        1    14.8   14.80  200.6091 < 2.2e-16 ***
NAME_HOUSING_TYPE                 5    11.8    2.36   32.0557 < 2.2e-16 ***
FLAG_WORK_PHONE                   1    10.0    9.99  135.4386 < 2.2e-16 ***
DEF_30_CNT_SOCIAL_CIRCLE          1     9.9    9.88  133.9102 < 2.2e-16 ***
REG_CITY_NOT_LIVE_CITY            1     8.0    8.05  109.0890 < 2.2e-16 ***
DAYS_REGISTRATION                 1     6.9    6.93   93.8974 < 2.2e-16 ***
REGION_RATING_CLIENT_W_CITY       2     6.7    3.36   45.5136 < 2.2e-16 ***
FLAG_DOCUMENT_3                   1     5.3    5.32   72.1344 < 2.2e-16 ***
AGE_GROUP                         4     4.8    1.20   16.3134 2.285e-13 ***
AMT_ANNUITY                       1     4.7    4.71   63.8411 1.354e-15 ***
EMPLOYMENT_YEAR                   5     4.2    0.85   11.4648 4.349e-11 ***
FLAG_PHONE                        1     3.6    3.58   48.5626 3.207e-12 ***
OWN_CAR_AGE                       1     2.9    2.91   39.3902 3.476e-10 ***
CNT_CHILDREN                      1     2.7    2.70   36.5430 1.495e-09 ***
DAYS_LAST_PHONE_CHANGE            1     2.5    2.55   34.5516 4.156e-09 ***
NAME_TYPE_SUITE                   7     2.5    0.35    4.7570 2.331e-05 ***
FLAG_DOCUMENT_18                  1     2.2    2.19   29.7456 4.931e-08 ***
FLAG_DOCUMENT_16                  1     2.0    2.03   27.5072 1.566e-07 ***
WEEKDAY_APPR_PROCESS_START        6     1.7    0.28    3.7868 0.0008958 ***
REG_CITY_NOT_WORK_CITY            1     1.6    1.59   21.5392 3.468e-06 ***
WALLSMATERIAL_MODE                7     1.5    0.22    2.9613 0.0041933 ** 
HOUR_APPR_PROCESS_START           1     1.2    1.21   16.4369 5.031e-05 ***
AMT_REQ_CREDIT_BUREAU_QRT         1     1.1    1.11   15.0008 0.0001075 ***
APARTMENTS_AVG                    1     1.0    1.04   14.0331 0.0001797 ***
FLOORSMAX_AVG                     1     1.0    0.97   13.1753 0.0002837 ***
FLAG_DOCUMENT_5                   1     0.9    0.93   12.6546 0.0003747 ***
FLAG_DOCUMENT_2                   1     0.9    0.92   12.5059 0.0004058 ***
FONDKAPREMONT_MODE                4     0.9    0.23    3.0499 0.0159292 *  
AMT_INCOME_RANGE                 10     0.9    0.09    1.1889 0.2925494    
OBS_30_CNT_SOCIAL_CIRCLE          1     0.8    0.80   10.8412 0.0009928 ***
YEARS_EMPLOYED                    1     0.6    0.57    7.7489 0.0053749 ** 
AMT_REQ_CREDIT_BUREAU_WEEK        1     0.5    0.52    6.9830 0.0082291 ** 
YEARS_BUILD_AVG                   1     0.5    0.48    6.4516 0.0110859 *  
FLAG_DOCUMENT_14                  1     0.5    0.47    6.4222 0.0112710 *  
FLAG_EMAIL                        1     0.5    0.45    6.1266 0.0133167 *  
EMERGENCYSTATE_MODE               2     0.4    0.22    3.0362 0.0480175 *  
FLAG_DOCUMENT_13                  1     0.4    0.43    5.8133 0.0159061 *  
FLAG_DOCUMENT_8                   1     0.4    0.43    5.7729 0.0162760 *  
FLAG_CONT_MOBILE                  1     0.4    0.42    5.6940 0.0170236 *  
YEARS_BEGINEXPLUATATION_AVG       1     0.4    0.36    4.8943 0.0269458 *  
NONLIVINGAREA_MODE                1     0.3    0.26    3.5766 0.0585992 .  
FLAG_DOCUMENT_15                  1     0.2    0.23    3.1349 0.0766321 .  
AMT_REQ_CREDIT_BUREAU_MON         1     0.2    0.23    3.1288 0.0769197 .  
HOUSETYPE_MODE                    3     0.2    0.07    0.9618 0.4096356    
COMMONAREA_AVG                    1     0.2    0.19    2.5410 0.1109255    
FLAG_DOCUMENT_6                   1     0.2    0.18    2.4017 0.1212067    
FLAG_OWN_REALTY                   1     0.2    0.16    2.2253 0.1357670    
FLAG_DOCUMENT_9                   1     0.2    0.16    2.1759 0.1401869    
AGE                               1     0.1    0.13    1.8259 0.1766090    
ELEVATORS_AVG                     1     0.1    0.13    1.8056 0.1790337    
DEF_60_CNT_SOCIAL_CIRCLE          1     0.1    0.13    1.7604 0.1845784    
FLAG_DOCUMENT_17                  1     0.1    0.13    1.7157 0.1902462    
BASEMENTAREA_AVG                  1     0.1    0.12    1.6208 0.2029832    
LIVINGAPARTMENTS_MODE             1     0.1    0.11    1.4872 0.2226486    
LIVE_REGION_NOT_WORK_REGION       1     0.1    0.10    1.4037 0.2361015    
NONLIVINGAPARTMENTS_MODE          1     0.1    0.10    1.3907 0.2382909    
COMMONAREA_MEDI                   1     0.1    0.10    1.3692 0.2419439    
ENTRANCES_AVG                     1     0.1    0.10    1.3629 0.2430302    
LIVINGAPARTMENTS_MEDI             1     0.1    0.10    1.3367 0.2476193    
LIVE_CITY_NOT_WORK_CITY           1     0.1    0.09    1.2457 0.2643816    
LANDAREA_MODE                     1     0.1    0.09    1.2198 0.2694062    
LANDAREA_MEDI                     1     0.1    0.08    1.0380 0.3082776    
YEARS_BEGINEXPLUATATION_MEDI      1     0.1    0.08    1.0232 0.3117562    
LANDAREA_AVG                      1     0.1    0.07    0.9787 0.3225119    
OBS_60_CNT_SOCIAL_CIRCLE          1     0.1    0.07    0.9415 0.3318868    
FLAG_DOCUMENT_11                  1     0.1    0.06    0.8306 0.3620880    
ENTRANCES_MODE                    1     0.1    0.06    0.8116 0.3676632    
BASEMENTAREA_MEDI                 1     0.1    0.06    0.7902 0.3740473    
FLAG_DOCUMENT_19                  1     0.1    0.05    0.7154 0.3976610    
FLAG_DOCUMENT_10                  1     0.0    0.04    0.5863 0.4438568    
LIVINGAREA_MEDI                   1     0.0    0.04    0.5707 0.4499968    
ELEVATORS_MODE                    1     0.0    0.04    0.5581 0.4550200    
SK_ID_CURR                        1     0.0    0.04    0.5548 0.4563665    
YEARS_BUILD_MEDI                  1     0.0    0.03    0.4680 0.4939106    
FLAG_DOCUMENT_4                   1     0.0    0.03    0.4223 0.5158133    
NONLIVINGAREA_AVG                 1     0.0    0.02    0.3273 0.5672391    
FLAG_DOCUMENT_20                  1     0.0    0.02    0.3222 0.5702855    
LIVINGAREA_AVG                    1     0.0    0.02    0.3196 0.5718629    
NONLIVINGAPARTMENTS_MEDI          1     0.0    0.02    0.2963 0.5861934    
APARTMENTS_MODE                   1     0.0    0.02    0.2934 0.5880228    
FLOORSMAX_MODE                    1     0.0    0.02    0.2931 0.5882637    
FLAG_MOBIL                        1     0.0    0.02    0.2601 0.6100336    
ENTRANCES_MEDI                    1     0.0    0.01    0.2030 0.6522739    
FLOORSMAX_MEDI                    1     0.0    0.01    0.2024 0.6527951    
FLAG_DOCUMENT_7                   1     0.0    0.01    0.1791 0.6721436    
YEARS_BUILD_MODE                  1     0.0    0.01    0.1750 0.6757463    
AMT_REQ_CREDIT_BUREAU_YEAR        1     0.0    0.01    0.1715 0.6787674    
FLAG_DOCUMENT_21                  1     0.0    0.01    0.1643 0.6851913    
FLOORSMIN_AVG                     1     0.0    0.01    0.1481 0.7003682    
LIVINGAREA_MODE                   1     0.0    0.01    0.1242 0.7245029    
TOTALAREA_MODE                    1     0.0    0.01    0.0983 0.7538283    
FLAG_DOCUMENT_12                  1     0.0    0.01    0.0856 0.7698666    
FLAG_EMP_PHONE                    1     0.0    0.01    0.0801 0.7770995    
YEARS_BEGINEXPLUATATION_MODE      1     0.0    0.01    0.0787 0.7790962    
ELEVATORS_MEDI                    1     0.0    0.00    0.0570 0.8112549    
NONLIVINGAREA_MEDI                1     0.0    0.00    0.0412 0.8391866    
FLOORSMIN_MODE                    1     0.0    0.00    0.0403 0.8408954    
APARTMENTS_MEDI                   1     0.0    0.00    0.0268 0.8698642    
REG_REGION_NOT_LIVE_REGION        1     0.0    0.00    0.0207 0.8857061    
LIVINGAPARTMENTS_AVG              1     0.0    0.00    0.0157 0.9003191    
AMT_REQ_CREDIT_BUREAU_HOUR        1     0.0    0.00    0.0137 0.9066605    
FLOORSMIN_MEDI                    1     0.0    0.00    0.0099 0.9207980    
COMMONAREA_MODE                   1     0.0    0.00    0.0069 0.9340282    
BASEMENTAREA_MODE                 1     0.0    0.00    0.0039 0.9504972    
AMT_REQ_CREDIT_BUREAU_DAY         1     0.0    0.00    0.0004 0.9844991    
REG_REGION_NOT_WORK_REGION        1     0.0    0.00    0.0001 0.9924951    
NONLIVINGAPARTMENTS_AVG           1     0.0    0.00    0.0001 0.9926282    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

EXT_SOURCE_3 AMT_GOODS_PRICE FLAG_OWN_CAR EXT_SOURCE_1 CODE_GENDER DAYS_BIRTH NAME_EDUCATION_TYPE DAYS_EMPLOYED AMT_CREDIT NAME_INCOME_TYPE EXT_SOURCE_2 NAME_CONTRACT_TYPE OCCUPATION_TYPE NAME_FAMILY_STATUS AMT_CREDIT_RANGE

# Tally how many rows fall in each TARGET class.
Imbalance <- as.data.frame(table(datos$TARGET))
names(Imbalance) <- c("Loan_Repayment_Status", "Count")

# Swap the 0/1 codes for readable class names.
Imbalance$Loan_Repayment_Status <- factor(
  Imbalance$Loan_Repayment_Status,
  levels = c(0, 1),
  labels = c("Repayer", "Defaulter")
)

# Share of each class (in percent) and the text shown above each bar.
Imbalance$Percent <- prop.table(Imbalance$Count) * 100
Imbalance$Label <- paste0(round(Imbalance$Percent, 1), "%")

# Bar chart of the class counts, annotated with the percentage labels.
ggplot(
  Imbalance,
  aes(x = Loan_Repayment_Status, y = Count, fill = Loan_Repayment_Status)
) +
  geom_col() +
  geom_text(aes(label = Label), vjust = -0.5) +
  scale_fill_manual(values = c("green", "red")) +
  labs(
    title = "Imbalance Plotting",
    x = "Loan Repayment Status",
    y = "Count of Repayers & Defaulters"
  ) +
  theme_minimal()

Definimos una función que, dada una variable, nos dé un gráfico de barras con el porcentaje de pagos devueltos y no devueltos según la variable.

# Plot, for each value of a categorical column, the percentage of repayers
# and defaulters.
#
# Args:
#   df:       data frame holding `variable` and a `TARGET` column coded 0/1.
#   variable: name (string) of the column to show on the x axis.
#
# Returns:
#   A ggplot object with dodged bars (one per TARGET class) whose height is
#   the within-category percentage.
#
# Raises:
#   An error when `variable` is not a column of `df`.
plot_loan_repayment <- function(df, variable) {
  # Fail early if the requested column is missing
  if (!(variable %in% colnames(df))) {
    stop("La variable especificada no existe en el dataframe.")
  }

  # Working copy with only the two columns that are needed
  df_plot <- df[, c(variable, "TARGET")]

  # Treat the x variable as categorical so the discrete x scale used below
  # also applies when the column is stored as numeric (e.g. a 0/1 flag)
  df_plot[[variable]] <- as.factor(df_plot[[variable]])

  # Label the TARGET classes
  df_plot$TARGET <- factor(
    df_plot$TARGET,
    levels = c(0, 1),
    labels = c("Repayer", "Defaulter")
  )

  # Count rows per (category, TARGET) and convert to a percentage within each
  # category; ungroup so the returned plot data carries no leftover grouping
  df_prop <- df_plot %>%
    group_by(.data[[variable]], TARGET) %>%
    summarise(n = n(), .groups = "drop") %>%
    group_by(.data[[variable]]) %>%
    mutate(pct = n / sum(n) * 100) %>%
    ungroup()

  # aes() with the .data pronoun replaces the deprecated aes_string()
  ggplot(df_prop, aes(x = .data[[variable]], y = pct, fill = TARGET)) +
    geom_col(position = "dodge") +
    labs(
      title = paste("Distribución porcentual de", variable, "según estado de pago"),
      x = variable, y = "Porcentaje (%)"
    ) +
    scale_fill_manual(values = c("green", "red")) +
    scale_x_discrete(guide = guide_axis(angle = 45)) +
    theme_minimal()
}

Graficar variables categoricas

# Example usage with the FLAG_OWN_CAR variable
plot_loan_repayment(datos, "FLAG_OWN_CAR")
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.

# Percentage bar charts (repayers vs. defaulters) for the remaining
# categorical variables; each call prints its own plot.
plot_loan_repayment(datos, "CODE_GENDER")

plot_loan_repayment(datos, "NAME_CONTRACT_TYPE")

plot_loan_repayment(datos, "NAME_EDUCATION_TYPE")

plot_loan_repayment(datos, "NAME_INCOME_TYPE")

plot_loan_repayment(datos, "AMT_CREDIT_RANGE")

plot_loan_repayment(datos, "NAME_FAMILY_STATUS")

plot_loan_repayment(datos, "ORGANIZATION_TYPE")

plot_loan_repayment(datos, "OCCUPATION_TYPE")

plot_loan_repayment(datos, "NAME_HOUSING_TYPE")

plot_loan_repayment(datos, "EMPLOYMENT_YEAR")

plot_loan_repayment(datos, "FLAG_DOCUMENT_3")

plot_loan_repayment(datos, "NAME_TYPE_SUITE")

Graficar variables continuas

# Draw one density curve per TARGET class for a continuous column, showing
# each class's share of the rows in the legend.
#
# Args:
#   data:     data frame holding `variable` and a `TARGET` column coded 0/1.
#   variable: name (string) of the continuous column to plot.
#
# Returns:
#   A ggplot object with one density line per TARGET class.
#
# Raises:
#   An error when `variable` is not a column of `data`.
graficar_variable <- function(data, variable) {
  # Fail early if the requested column is missing (same check and message as
  # plot_loan_repayment)
  if (!(variable %in% colnames(data))) {
    stop("La variable especificada no existe en el dataframe.")
  }

  # Share of rows in each TARGET class, formatted as "12.3%"
  porcentajes <- data %>%
    group_by(TARGET) %>%
    summarise(n = n()) %>%
    mutate(porc = paste0(round(100 * n / sum(n), 1), "%"))

  # Legend labels: class name plus its share. This pairs the sorted TARGET
  # values with the rows of `porcentajes`, which presumably come back in the
  # same sorted order from summarise().
  levels_target <- sort(unique(data$TARGET))
  etiquetas <- paste0(
    ifelse(levels_target == 0, "Repayers", "Defaulters"),
    " (", porcentajes$porc, ")"
  )

  # `linewidth` replaces the `size` argument deprecated for lines
  ggplot(data, aes(x = .data[[variable]], color = as.factor(TARGET))) +
    geom_density(linewidth = 1) +
    labs(
      x = variable,
      y = "Densidad",
      title = paste("Distribución de", variable, "según TARGET")
    ) +
    scale_color_manual(
      values = c("blue", "red"),
      labels = etiquetas,
      name = "TARGET"
    ) +
    theme_minimal()
}
# Example usage with the "AMT_CREDIT" variable
graficar_variable(datos, "AMT_CREDIT")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

# Density plots by TARGET for the remaining continuous variables
graficar_variable(datos, "DAYS_BIRTH")

graficar_variable(datos, "AMT_GOODS_PRICE")

graficar_variable(datos, "DAYS_EMPLOYED")

graficar_variable(datos, "DAYS_LAST_PHONE_CHANGE")

graficar_variable(datos, "AMT_INCOME_TOTAL")

graficar_variable(datos, "AGE")

Guardar base de datos depurada para modelos

Primero eliminamos las variables menos significativas y nos quedamos con las más significativas.

# Columns kept for modelling, plus the response TARGET.
# NOTE(review): NAME_FAMILY_STATUS, ORGANIZATION_TYPE, OCCUPATION_TYPE and
# DAYS_ID_PUBLISH rank high in the ANOVA table but are not listed here --
# confirm the omission is intentional.
variables_significativas <- c(
  "EXT_SOURCE_3",
  "EXT_SOURCE_2",
  "DAYS_BIRTH",
  "AMT_GOODS_PRICE",
  "FLAG_OWN_CAR",
  "EXT_SOURCE_1",
  "CODE_GENDER",
  "NAME_EDUCATION_TYPE",
  "DAYS_EMPLOYED",
  "REGION_RATING_CLIENT",
  "AMT_CREDIT",
  "NAME_INCOME_TYPE",
  "NAME_CONTRACT_TYPE",
  "AMT_CREDIT_RANGE",
  "REGION_POPULATION_RELATIVE",
  "NAME_HOUSING_TYPE",
  "FLAG_WORK_PHONE",
  "DEF_30_CNT_SOCIAL_CIRCLE",
  "REG_CITY_NOT_LIVE_CITY",
  "DAYS_REGISTRATION",
  "REGION_RATING_CLIENT_W_CITY",
  "FLAG_DOCUMENT_3",
  "AGE_GROUP",
  "EMPLOYMENT_YEAR",
  "FLAG_PHONE",
  "OWN_CAR_AGE",
  "CNT_CHILDREN",
  "DAYS_LAST_PHONE_CHANGE",
  "FLAG_DOCUMENT_18",
  "NAME_TYPE_SUITE",
  "FLAG_DOCUMENT_16",
  "WEEKDAY_APPR_PROCESS_START",
  "REG_CITY_NOT_WORK_CITY",
  "AMT_ANNUITY",
  "WALLSMATERIAL_MODE",
  "AMT_INCOME_TOTAL",
  "HOUR_APPR_PROCESS_START",
  "AMT_REQ_CREDIT_BUREAU_QRT",
  "APARTMENTS_AVG",
  "FLOORSMAX_AVG",
  "FLAG_DOCUMENT_5",
  "FLAG_DOCUMENT_2",
  "FONDKAPREMONT_MODE",
  "OBS_30_CNT_SOCIAL_CIRCLE",
  "YEARS_EMPLOYED",
  "TARGET"
)

# Restrict the data set to the selected columns.
datos <- datos[, variables_significativas]

# NOTE(review): the original comment here said rows with NA values are
# removed, but no such step exists in this block -- confirm whether one is
# missing.
# Persist the reduced data set so later modelling can start from this file.
save(datos, file = "DatosDepurados.RDa")